#############################################################################
# Gene expression analysis of expression data from human bone
# marrow hematopoietic stem cells
# Organism  Homo sapiens
# Dataset: GSE32719 (http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE32719)
# Pang WW, Price EA, Sahoo D, Beerman I et al. Human bone marrow hematopoietic stem cells 
# are increased in frequency and myeloid-biased with age.
# Proc Natl Acad Sci U S A 2011 Dec 13;108(50):20012-7.
# PMID: 22123971 (http://www.ncbi.nlm.nih.gov/pubmed/22123971)
# R code: (unknown) double-blind review restriction
##############################################################################
##load required packages
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(biomaRt)
##load the expression data previously was extracted using GEOquery package from hsc_analyses_EG-07022020 code in the processing directory  and saved to GSE32719.expression.matrix.csv

GSEdata=read.csv("GSE32719.expression.matrix.csv",stringsAsFactors = FALSE)

geneIDs=GSEdata$X
conditions=colnames(GSEdata[,-1])

# Check the loaded dataset
dim(GSEdata) # Dimension of the dataset
## [1] 54675    28
head(GSEdata) # First few rows
##          X GSM812988 GSM812989 GSM812990 GSM812991 GSM812992 GSM812993
## 1   DDR1_1  7.884408  7.808434  7.487010  7.569611  7.250681  6.944877
## 2   RFC2_2  6.737823  6.437587  7.769109  7.368188  7.472075  8.483271
## 3  HSPA6_3  6.627552  6.682880  5.085364  5.515489  5.265894  5.621940
## 4   PAX8_4  7.415271  7.402765  7.685612  7.740810  7.665285  8.014437
## 5 GUCA1A_5  3.338770  3.644936  3.384022  3.611830  3.598129  3.245417
## 6   UBA7_6  9.916951  9.048261  9.833937  9.473971  9.261590  9.694111
##   GSM812994 GSM812995 GSM812996 GSM812997 GSM812998 GSM812999 GSM813000
## 1  7.421968  7.358658  7.173638  8.099701  7.894993  8.200512  8.199732
## 2  7.205830  7.752333  7.177863  6.904935  6.502177  7.506968  6.700267
## 3  5.774978  5.288154  5.317504  5.049701  5.321981  5.164705  5.256353
## 4  7.620498  8.117782  8.476150  7.328249  7.622319  7.614827  7.248452
## 5  4.010995  3.238467  3.533082  3.156613  3.127926  3.292835  3.491960
## 6  9.276572 10.254284 10.658883  9.638729  9.365365  9.479343  9.994390
##   GSM813001 GSM813002 GSM813003 GSM813004 GSM813005 GSM813006 GSM813007
## 1  7.791689  7.813888  7.901430  8.086425  8.290824  8.142938  5.958269
## 2  6.659768  7.013816  6.993406  6.393265  6.812322  6.579866  6.696320
## 3  5.296228  5.288170  5.520958  5.237739  5.193563  5.107821  4.900028
## 4  7.639622  7.522083  7.213168  7.333272  7.491317  7.482816  6.986802
## 5  3.582134  3.352848  3.469692  3.323915  3.343692  3.475047  3.496106
## 6  9.323403  9.699203  8.737581  9.942752  9.923929  9.534929  9.174486
##   GSM813008 GSM813009 GSM813010 GSM813011 GSM813012 GSM813013 GSM813014
## 1  7.752040  7.550791  7.621439  8.045653  8.009282  7.710599  7.814352
## 2  6.325255  6.480209  6.863048  6.416551  6.996908  6.809659  6.830948
## 3  5.382177  5.681456  5.188808  5.028203  5.184249  5.060479  4.818644
## 4  7.967097  7.621571  7.347287  7.573728  7.203639  7.520831  7.523141
## 5  4.571782  3.561773  3.322275  3.405920  3.632146  3.295618  3.587407
## 6  9.848087  9.833564 10.101484  9.965800 10.024517 10.030649  9.750748
tail(GSEdata) # Last few rows
##              X GSM812988 GSM812989 GSM812990 GSM812991 GSM812992 GSM812993
## 54670 NA_54670 13.770716 13.994500 11.288592 10.313899 13.538016 14.485988
## 54671 NA_54671  9.891545 10.054197  5.365113  5.982599  9.886868 11.924218
## 54672 NA_54672 12.321272 12.614288  8.244337  8.323074 12.091622 13.694000
## 54673 NA_54673  3.276871  3.351414  3.280466  3.272042  3.395136  3.671908
## 54674 NA_54674  4.238403  3.646006  3.704059  3.981665  4.032586  4.677945
## 54675 NA_54675  4.163933  3.836323  3.706053  4.101131  3.665141  4.500671
##       GSM812994 GSM812995 GSM812996 GSM812997 GSM812998 GSM812999 GSM813000
## 54670 14.520052 14.178241 14.078583 14.001334 13.973219 10.529092  9.162595
## 54671 11.868060 10.719133 10.790617  9.724162  9.221820  5.167606  4.645668
## 54672 13.714543 13.069249 12.974032 12.364099 12.008661  7.537253  6.374678
## 54673  3.585744  3.355240  3.767454  3.162290  3.132417  3.306447  3.208896
## 54674  4.093954  4.360997  4.115608  3.697179  3.816241  3.781706  3.673603
## 54675  4.624900  4.104248  4.173879  3.601409  3.598923  3.662857  3.511473
##       GSM813001 GSM813002 GSM813003 GSM813004 GSM813005 GSM813006 GSM813007
## 54670 11.017625 13.004120 13.714042  9.474458  9.719739  7.909547 13.485361
## 54671  6.761740  8.422473 10.464575  5.240622  4.334311  4.103851  9.764244
## 54672  7.622667 10.646550 12.351903  6.504358  6.635874  4.999693 11.984164
## 54673  3.450305  3.204290  3.283685  3.243439  3.126628  3.201200  3.421008
## 54674  3.352568  3.690452  3.890352  3.700332  3.783954  3.731791  3.547028
## 54675  3.675848  3.764469  3.912864  3.677868  3.552473  3.737098  3.860874
##       GSM813008 GSM813009 GSM813010 GSM813011 GSM813012 GSM813013 GSM813014
## 54670 13.542793 14.371288 14.101520 10.178728 12.701168 12.501800 11.059357
## 54671  9.354944 11.776899 10.798681  6.686006  7.583291  7.336464  6.463792
## 54672 12.126887 13.544123 12.895119  7.408262 10.641604 10.386612  8.562743
## 54673  3.545974  3.524391  3.546822  3.339191  3.466229  3.175700  3.244836
## 54674  3.698521  3.860318  4.038958  3.949230  3.748598  3.557370  3.849345
## 54675  3.745716  4.233577  3.806837  3.704331  3.631327  3.511816  3.504921
raw_data=data.matrix(GSEdata,rownames.force = NA)
## Warning in data.matrix(GSEdata, rownames.force = NA): NAs introduced by coercion
rownames(raw_data)=geneIDs

#write.csv(raw_data,"raw_data.csv")

data1 <- raw_data[,-1]
geneSymbols=gsub("_[0-9]*$", "", rownames(data1))
rownames(data1)=geneSymbols
data<-data1[!rownames(data1) %in% "NA", ]
###################
# Exploratory plots
###################

# Check the behavior of the data
#pdf("plots/histogramRawData.pdf")
hist(data, col = "gray", main="GSE32719 - Histogram")

#dev.off()

# Log2 transformation (why?)
data2 = log2(data)

# Check the behavior of the data after log-transformation
#pdf("plots/Log2histogramRawData.pdf")
hist(data2, col = "gray", main="GSE32719 (log2) - Histogram")

#dev.off()
# Boxplot
#pdf("plots\boxPlotRawData.pdf")
boxplot(data2, col=c("darkgreen", "darkgreen", "darkgreen","darkgreen","darkgreen",
                     "darkgreen", "darkgreen", "darkgreen","darkgreen","darkgreen",
                     "darkgreen", "darkgreen","darkgreen","darkred",
                     "darkred", "darkred", "darkred","darkred","blue", "blue", "blue","blue", "blue", "blue","blue", "blue","blue"),
        main="GSE32719 - boxplots", las=2,cex.axis=0.6)

#dev.off()
# Hierarchical clustering of the "samples" based on
# the correlation coefficients of the expression values
hc = hclust(as.dist(1-cor(data2)))
#pdf("plots/HclustRawData.pdf")
plot(hc, main="GSE32719 - Hierarchical Clustering")

#dev.off()
#######################################
# Differential expression (DE) analysis
#######################################

# Separate each of the conditions into three smaller data frames
young = data2[,1:13]
middle = data2[,14:18]
old = data2[,19:26]

YOdata=cbind(young,old)
YMdata=cbind(young,middle)
MOdata=cbind(middle,old)


# Compute the means of the samples of each condition
young.mean = apply(young, 1, mean)
middle.mean = apply(middle, 1, mean)
old.mean = apply(old,1, mean)

head(young.mean)
##     DDR1     RFC2    HSPA6     PAX8   GUCA1A     UBA7 
## 2.931289 2.850180 2.463136 2.941439 1.777652 3.274266
head(middle.mean)
##     DDR1     RFC2    HSPA6     PAX8   GUCA1A     UBA7 
## 2.995422 2.759273 2.407664 2.894989 1.771068 3.250103
head(old.mean)
##     DDR1     RFC2    HSPA6     PAX8   GUCA1A     UBA7 
## 2.919804 2.731692 2.374850 2.898774 1.838715 3.294212
# Just get the maximum of all the means
limit = max(young.mean, middle.mean,old.mean)

# Scatter plots of young vs old and young vs middle aged
#pdf("plots/comparisonConditions.pdf")
par(mfrow=c(2,3))
plot(young.mean ~ old.mean, xlab = "young", ylab = "old",
     main = "GSE32719 - Scatter", xlim = c(0, limit), ylim = c(0, limit))
# Diagonal line
abline(0, 1, col = "red")

######compare young and middle##############
plot(young.mean ~ middle.mean, xlab = "young", ylab = "middle",
     #main = "GSE32719 - Scatter", 
     xlim = c(0, limit), ylim = c(0, limit))
# Diagonal line
abline(0, 1, col = "blue")
##########compare middle and old aged patients
plot(middle.mean ~ old.mean, xlab = "middle", ylab = "old",
     #main = "GSE32719 - Scatter", 
     xlim = c(0, limit), ylim = c(0, limit))
# Diagonal line
abline(0, 1, col = "green")
# Compute fold-change (biological significance)
# Difference between the means of the conditions
#par(mfrow=c(1,3))
foldYO = old.mean - young.mean

# Histogram of the fold differences
hist(foldYO, col = "red")

## between M vs Y ####
foldYM = middle.mean - young.mean

# Histogram of the fold differences
hist(foldYM, col = "blue")

## between O vs M ####
foldMO = old.mean - middle.mean

# Histogram of the fold differences
hist(foldMO, col = "green")

#dev.off()
# Compute statistical significance (using t-test)
pvalueYO = NULL # Empty list for the p-values
tstatYO = NULL # Empty list of the t test statistics
pvalueYM = NULL # Empty list for the p-values
tstatYM = NULL # Empty list of the t test statistics
pvalueMO = NULL # Empty list for the p-values
tstatMO = NULL # Empty list of the t test statistics

for(i in 1 : nrow(data)) { # For each gene : 
  Y = young[i,] # WT of gene number i
  M = middle[i,] # KO of gene number i
  O = old[i,]
  # Compute t-test between the bi- conditions
  tYO = t.test(Y, O)
  
  tYM = t.test(Y, M)
  
  tMO = t.test(M, O)
  
  # Put the current p-value in the pvalues list
  pvalueYO[i] = tYO$p.value
  # Put the current t-statistic in the tstats list
  tstatYO[i] = tYO$statistic
  
  # Put the current p-value in the pvalues list
  pvalueYM[i] = tYM$p.value
  # Put the current t-statistic in the tstats list
  tstatYM[i] = tYM$statistic
  
  # Put the current p-value in the pvalues list
  pvalueMO[i] = tMO$p.value
  # Put the current t-statistic in the tstats list
  tstatMO[i] = tMO$statistic
}

head(pvalueYO)
## [1] 0.840425708 0.004623247 0.056844064 0.128518348 0.332272408 0.427825292
head(pvalueYM)
## [1] 0.03335770 0.03992604 0.18210211 0.06058612 0.85200750 0.56131090
head(pvalueMO)
## [1] 0.1963906 0.3939766 0.2645733 0.8814252 0.2668504 0.2989935
#pdf("plots/pvalueByConditionHist.pdf")
par(mfrow=c(3,1))
# Histogram of p-values (-log10)
hist(-log10(pvalueYO), col = "red")
hist(-log10(pvalueYM), col = "blue")
hist(-log10(pvalueMO), col = "green")

#dev.off()

# Volcano: put the biological significance (fold-change)
# and statistical significance (p-value) in one plot
#pdf("plots/foldYO_volcano_I.pdf")
plot(foldYO, -log10(pvalueYO), main = "GSE32719 YO conditions - Volcano")
#dev.off()

fold_cutoff_YO = log2(1.2)
pvalue_cutoff_YO = 0.01

# Screen for the genes that satisfy the filtering criteria

# Fold-change filter for "biological" significance
filter_by_fold_YO = abs(foldYO) >= fold_cutoff_YO
dim(YOdata[filter_by_fold_YO, ])
## [1] 237  21
# P-value filter for "statistical" significance
filter_by_pvalue_YO = pvalueYO <= pvalue_cutoff_YO
dim(YOdata[filter_by_pvalue_YO, ])
## [1] 649  21
# Combined filter (both biological and statistical)
filter_combined_YO = filter_by_fold_YO & filter_by_pvalue_YO

filtered_YO = YOdata[filter_combined_YO,]
dim(filtered_YO)
## [1] 103  21
head(filtered_YO)
##           GSM812988 GSM812989 GSM812990 GSM812991 GSM812992 GSM812993 GSM812994
## STMN1      2.199005  2.227024  2.688487  2.748846  2.607545  2.045028  2.579443
## NKX2-3     2.356841  2.173925  2.531269  2.381756  2.696276  2.915205  2.018846
## ZNF483     2.340337  2.076402  2.843332  2.709023  2.576160  2.355085  2.506713
## CNNM2      2.215005  2.143364  2.134958  2.524655  1.997714  2.223072  2.278998
## SLC1A6     1.872153  1.887452  2.409534  2.291756  2.795033  2.906294  2.105433
## CDC42-IT1  2.705048  2.828832  2.478477  2.625209  2.773800  2.557121  2.362751
##           GSM812995 GSM812996 GSM812997 GSM812998 GSM812999 GSM813000 GSM813006
## STMN1      2.762775  2.727239  2.527133  2.589342  2.619882  2.557248  2.263959
## NKX2-3     2.666570  2.837494  2.648280  2.715775  2.448178  2.681903  2.897596
## ZNF483     2.623641  2.501987  2.764130  2.921756  2.885695  2.783556  2.987007
## CNNM2      2.747677  1.894273  2.097847  1.953121  2.283668  2.307014  2.095508
## SLC1A6     2.962646  2.632502  2.638485  2.720436  2.419948  1.806488  2.576278
## CDC42-IT1  2.032072  2.272670  2.813177  2.717340  2.374196  2.941816  3.077937
##           GSM813007 GSM813008 GSM813009 GSM813010 GSM813011 GSM813012 GSM813013
## STMN1      1.941420  1.861400  2.124118  1.971108  2.220143  2.143900  2.132806
## NKX2-3     2.880764  2.704343  3.232080  3.145029  2.778223  2.776166  3.048084
## ZNF483     2.895563  3.146203  2.644492  3.182856  3.036737  3.004917  3.015023
## CNNM2      1.930292  1.734418  1.925211  1.871978  1.987162  2.009395  1.976887
## SLC1A6     2.980219  2.485324  2.832418  2.893697  2.850576  3.027672  2.905034
## CDC42-IT1  2.733769  2.983628  2.645847  3.080920  3.141138  2.947566  2.698981
# Let's generate the volcano plot again,
# highlighting the significantly differential expressed genes
#pdf("plots/foldYO_volcanoYO)_II.pdf")
plot(foldYO, -log10(pvalueYO), main = "GSE32719 YO conditions - Volcano #2")
points (foldYO[filter_combined_YO], -log10(pvalueYO[filter_combined_YO]),
        pch = 16, col = "red")
#dev.off()

# Highlighting up-regulated in red and down-regulated in blue
#pdf("plots/foldYO_volcanoYO_III.pdf")
plot(foldYO, -log10(pvalueYO), main = "GSE32719 YO conditions - Volcano #3")
points (foldYO[filter_combined_YO & foldYO < 0],
        -log10(pvalueYO[filter_combined_YO & foldYO < 0]),
        pch = 16, col = "blue")
points (foldYO[filter_combined_YO & foldYO > 0],
        -log10(pvalueYO[filter_combined_YO & foldYO > 0]),
        pch = 16, col = "red")
legend("topleft",c("NO","up", "down"),cex=.8,
       col=c("black","red","blue"),pch=c(21,19,19),title="Genes")

#dev.off()

upRegulated_DEGs_YO = foldYO[filter_combined_YO & foldYO > 0]
length(upRegulated_DEGs_YO)
## [1] 70
downRegulated_DEGs_YO =  foldYO[filter_combined_YO & foldYO < 0]
length(downRegulated_DEGs_YO)
## [1] 33
#write.csv(names(upRegulated_DEGs_YO), "up_YO.csv")
#write.csv(names(downRegulated_DEGs_YO), "down_YO.csv")


# Cluster the rows (genes) & columns (samples) by correlation
rowvYO = as.dendrogram(hclust(as.dist(1-cor(t(filtered_YO)))))
colvYO = as.dendrogram(hclust(as.dist(1-cor(filtered_YO))))

#pdf("plots/sampleHeatmap_YO.pdf")

heatmap.2(filtered_YO, Rowv=rowvYO,Colv=colvYO, 
          col = rev(greenred(20*10)), cexRow=0.8,
          cexCol=0.8,scale="row",notecol="black",
          margins=c(5,5), # ("margin.Y", "margin.X")
          trace='none', 
          symkey=FALSE, 
          symbreaks=FALSE, 
          dendrogram='none',
          density.info='histogram', 
          denscol="black",
          keysize=1, 
          #( "bottom.margin", "left.margin", "top.margin", "left.margin" )
          key.par=list(mar=c(3.5,0,3,0)))

# lmat -- added 2 lattice sections (5 and 6) for padding
#lmat=rbind(c(3, 2, 1), c(6, 1, 3)), lhei=c(2,3), lwid=c(1, 3, 1))
#library(gplots)
#dev.off()

saveRDS(filtered_YO,"youngOld.RDS")

# Save the DE genes to a text file
#write.csv(filtered_YO, "GSE32719_YO_DE.csv") #sep = "\t",
#quote = FALSE)
#############################################################################
########## YM conditions ################

# Volcano: put the biological significance (fold-change)
# and statistical significance (p-value) in one plot
#pdf("plots/foldYM_volcanoYM_I.pdf")
plot(foldYM, -log10(pvalueYM), main = "GSE32719 YM conditions - Volcano")

#dev.off()


fold_cutoff_YM = log2(1.2)
pvalue_cutoff_YM = 0.01

# Screen for the genes that satisfy the filtering criteria

# Fold-change filter for "biological" significance
filter_by_fold_YM = abs(foldYM) >= fold_cutoff_YM
dim(YMdata[filter_by_fold_YM, ])
## [1] 723  18
# P-value filter for "statistical" significance
filter_by_pvalue_YM = pvalueYM <= pvalue_cutoff_YM
dim(YMdata[filter_by_pvalue_YM, ])
## [1] 1699   18
# Combined filter (both biological and statistical)
filter_combined_YM = filter_by_fold_YM & filter_by_pvalue_YM

filtered_YM = YMdata[filter_combined_YM,]
dim(filtered_YM)
## [1] 291  18
head(filtered_YM)
##         GSM812988 GSM812989 GSM812990 GSM812991 GSM812992 GSM812993 GSM812994
## ARL11    2.253317  2.360007  1.928407  1.612269  1.630158  1.704614  1.507577
## KLHDC1   2.754379  2.578578  2.943292  2.836876  2.621134  2.539873  2.139900
## TMX3     1.997022  2.112761  2.467787  2.056554  2.374235  1.942973  1.770114
## EFCAB13  1.740540  1.754988  1.734457  1.856537  2.055267  1.966144  1.768582
## ZNF483   2.340337  2.076402  2.843332  2.709023  2.576160  2.355085  2.506713
## ABCD3    2.578995  2.460471  2.595698  2.634902  2.260064  2.306108  2.136450
##         GSM812995 GSM812996 GSM812997 GSM812998 GSM812999 GSM813000 GSM813001
## ARL11    1.557133  1.739600  2.482342  2.352407  2.511399  2.817176  2.739934
## KLHDC1   2.468662  2.258023  3.212565  2.992187  3.079403  2.987093  3.035587
## TMX3     2.081982  1.855680  2.000743  2.301467  2.552403  1.978050  2.336742
## EFCAB13  1.794697  1.792174  2.278635  2.176460  1.766549  1.812122  2.205928
## ZNF483   2.623641  2.501987  2.764130  2.921756  2.885695  2.783556  2.791046
## ABCD3    2.841588  2.531437  2.167222  2.322367  2.620507  2.355732  2.001314
##         GSM813002 GSM813003 GSM813004 GSM813005
## ARL11    2.598942  2.460305  2.842819  2.730768
## KLHDC1   3.051546  2.973761  2.950723  3.166616
## TMX3     2.349820  2.283255  2.543769  2.466596
## EFCAB13  2.425393  2.323305  2.179338  2.163854
## ZNF483   2.911935  2.853945  3.008140  3.019280
## ABCD3    2.282001  2.253844  2.159918  2.218047
# Let's generate the volcano plot again,
# highlighting the significantly differential expressed genes
#pdf("plots/foldYM_volcanoYM_II.pdf")
plot(foldYM, -log10(pvalueYM), main = "GSE32719 YM conditions - Volcano #2")
points (foldYM[filter_combined_YM], -log10(pvalueYM[filter_combined_YM]),
        pch = 16, col = "red")

#dev.off()


# Highlighting up-regulated in red and down-regulated in blue
#pdf("plots/foldYM_volcanoYM_III.pdf")
plot(foldYM, -log10(pvalueYM), main = "GSE32719 YM conditions - Volcano #3")
points (foldYM[filter_combined_YM & foldYM < 0],
        -log10(pvalueYM[filter_combined_YM & foldYM < 0]),
        pch = 16, col = "blue")
points (foldYM[filter_combined_YM & foldYM > 0],
        -log10(pvalueYM[filter_combined_YM & foldYM > 0]),
        pch = 16, col = "red")
legend("topleft",c("NO","down", "up"),cex=.8,
       col=c("black","blue","red"),pch=c(21,19,19),title="Genes")

#dev.off()

upRegulated_DEGs_YM  = foldYM[filter_combined_YM & foldYM > 0]
length(upRegulated_DEGs_YM)
## [1] 232
downRegulated_DEGs_YM =  foldYM[filter_combined_YM & foldYM < 0]
length(downRegulated_DEGs_YM)
## [1] 59
# Cluster the rows (genes) & columns (samples) by correlation
rowvYM = as.dendrogram(hclust(as.dist(1-cor(t(filtered_YM)))))
colvYM = as.dendrogram(hclust(as.dist(1-cor(filtered_YM))))

#Generate a heatmap
#pdf("plots/heatmap_filtered_YM.pdf")
heatmap(filtered_YM, Rowv=rowvYM, Colv=colvYM, cexCol=0.7)

#dev.off()



#pdf("plots/sampleHeatmap_YM.pdf")
heatmap.2(filtered_YM, Rowv=rowvYM,Colv=colvYM, 
          col = rev(greenred(20*10)), 
          cexCol=0.8,scale="row",notecol="black",
          margins=c(5,5), # ("margin.Y", "margin.X")
          trace='none', 
          symkey=FALSE, 
          symbreaks=FALSE, 
          dendrogram='none',
          density.info='histogram', 
          denscol="black",
          keysize=1, 
          key.par=list(mar=c(3.5,0,3,0)))

#dev.off()

heatmap.2(filtered_YM, Rowv=rowvYM, Colv=colvYM, cexCol=0.7,
          col = rev(redblue(256)), scale = "row")

#dev.off()

saveRDS(filtered_YM,"youngMiddle.RDS")
# Save the DE genes to a text file
#write.csv (filtered_YM, "GSE32719_YM_DE.csv")#, sep = "\t",
#quote = FALSE)

#write.csv(names(upRegulated_DEGs_YM), "up_YM.csv")
#write.csv(names(downRegulated_DEGs_YM), "down_YM.csv")
#############################################################################
########## MO conditions ################

# Volcano: put the biological significance (fold-change)
# and statistical significance (p-value) in one plot
#pdf("plots/foldMO_volcanoMO_I.pdf")
plot(foldMO, -log10(pvalueMO), main = "GSE32719 MO conditions - Volcano")

#dev.off()

fold_cutoff_MO = log2(1.2)
pvalue_cutoff_MO = 0.01


# Screen for the genes that satisfy the filtering criteria

# Fold-change filter for "biological" significance
filter_by_fold_MO = abs(foldMO) >= fold_cutoff_MO
dim(MOdata[filter_by_fold_MO, ])
## [1] 540  13
# P-value filter for "statistical" significance
filter_by_pvalue_MO = pvalueMO <= pvalue_cutoff_MO
dim(MOdata[filter_by_pvalue_MO, ])
## [1] 311  13
# Combined filter (both biological and statistical)
filter_combined_MO = filter_by_fold_MO & filter_by_pvalue_MO

filtered_MO = MOdata[filter_combined_MO,]
dim(filtered_MO)
## [1] 64 13
head(filtered_MO)
##           GSM813001 GSM813002 GSM813003 GSM813004 GSM813005 GSM813006 GSM813007
## STMN1      2.447901  2.315305  2.383504  2.430237  2.347003  2.263959  1.941420
## RPL32P3    2.948676  2.691774  2.759191  2.909474  2.873412  2.828554  2.401848
## PCBP1-AS1  3.064795  2.795942  2.740716  2.743369  2.811181  2.833281  2.453565
## IRAK3      2.783460  2.936768  2.640736  2.913213  2.739505  2.555179  2.525516
## HECTD1     2.754573  2.816688  2.733798  2.831142  2.653255  2.669166  2.197340
## AHR        2.867518  3.080271  3.115500  3.116647  3.204473  3.179465  2.762232
##           GSM813008 GSM813009 GSM813010 GSM813011 GSM813012 GSM813013
## STMN1      1.861400  2.124118  1.971108  2.220143  2.143900  2.132806
## RPL32P3    2.347996  2.361257  2.505445  2.875283  2.601851  2.579124
## PCBP1-AS1  2.333555  2.082120  2.329325  2.664067  2.254540  2.585855
## IRAK3      2.549802  2.147624  2.278666  2.598872  2.611929  2.794008
## HECTD1     2.151519  2.267669  2.540697  2.688946  2.622837  2.646879
## AHR        2.362739  2.596570  2.673025  2.751306  2.790130  2.748587
# Let's generate the volcano plot again,
# highlighting the significantly differential expressed genes
#pdf("plots/foldMO_volcanoMO_II.pdf")
plot(foldMO, -log10(pvalueMO), main = "GSE32719 MO conditions - Volcano #2")
points (foldMO[filter_combined_MO], -log10(pvalueMO[filter_combined_MO]),
        pch = 16, col = "red")

#dev.off()


# Highlighting up-regulated in red and down-regulated in blue
#pdf("plots/foldMO_volcanoMO_III.pdf")
plot(foldMO, -log10(pvalueMO), main = "GSE32719 MO conditions - Volcano #3")
points (foldMO[filter_combined_MO & foldMO < 0],
        -log10(pvalueMO[filter_combined_MO & foldMO < 0]),
        pch = 16, col = "blue")
points (foldMO[filter_combined_MO & foldMO > 0],
        -log10(pvalueMO[filter_combined_MO & foldMO > 0]),
        pch = 16, col = "red")
legend("topleft",c("NO","down", "up"),cex=.8,
       col=c("black","blue","red"),pch=c(21,19,19),title="Genes")

#dev.off()

upRegulated_DEGs_MO = foldMO[filter_combined_MO & foldMO > 0]
length(upRegulated_DEGs_MO)
## [1] 25
downRegulated_DEGs_MO =  foldMO[filter_combined_MO & foldMO < 0]
length(downRegulated_DEGs_MO)
## [1] 39
# Cluster the rows (genes) & columns (samples) by correlation
rowvMO = as.dendrogram(hclust(as.dist(1-cor(t(filtered_MO)))))
colvMO = as.dendrogram(hclust(as.dist(1-cor(filtered_MO))))

# Generate a heatmap
#pdf("plots/heatmap_filtered_MO.pdf")
heatmap(filtered_MO, Rowv=rowvMO, Colv=colvMO, cexCol=0.7)

#dev.off()

#pdf("plots/sampleHeatmap_MO.pdf")
heatmap.2(filtered_MO, Rowv=rowvMO,Colv=colvMO, 
          col = rev(greenred(20*10)), 
          cexCol=0.8,scale="row",notecol="black",
          margins=c(5,5), # ("margin.Y", "margin.X")
          trace='none', 
          symkey=FALSE, 
          symbreaks=FALSE, 
          dendrogram='none',
          density.info='histogram', 
          denscol="black",
          keysize=1, 
          key.par=list(mar=c(3.5,0,3,0)))

#dev.off()

# Enhanced heatmap
#pdf("plots/heatmap_filtered_MO.pdf")
heatmap.2(filtered_MO, Rowv=rowvMO, Colv=colvMO, cexCol=0.7,
          col = rev(redblue(256)), scale = "row")

#dev.off()

saveRDS(filtered_MO,"middleOld.RDS")

# Save the DE genes to a text file
#write.csv (filtered_MO, "GSE32719_MO_DE.csv")#, sep = "\t",
#quote = FALSE)

#write.csv(names(upRegulated_DEGs_MO),"up_MO.csv")
#write.csv(names(downRegulated_DEGs_MO),"down_MO.csv")
#############################################################################